home *** CD-ROM | disk | FTP | other *** search
- /* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
- /* ./glimpse/index/filetype.c */
/* --------------------------------------------------------------------------
   This function detects whether a given file is of a special type
   which we do not want to index.
   If so, it returns 1; otherwise it returns 0.
   A file is said to be binary if more than 10% of the characters in the
   sampled input are > 128.
   A file is a uuencoded file if (possibly after a mail header) there is
   a "begin" followed by 3 digits, and no lower-case characters.

   Statistics we are concerned with:
   1) average word length: should not be greater than 10.
   2) index density: the number of different words vs. the number of words.

-----------------------------------------------------------------------------*/
- #include "glimpse.h"
- #define SAMPLE_SIZE 8192
- #define WORD_THRESHOLD 18 /* the ratio between number of characters and
- delimiters (blanks or \n) above which the file is determined to be
- hqx or other non-natural language text */
-
- #if BG_DEBUG
- extern FILE *LOGFILE;
- #endif /*BG_DEBUG*/
- char *member[MAX_4K_HASH];
- int member_tag[MAX_4K_HASH];
- int file_id;
- extern char *getword();
- extern char INDEX_DIR[MAX_LINE_LEN];
-
- int
- filetype(name, dosuffix)
- char *name;
- int dosuffix;
- {
- unsigned char buffer[SAMPLE_SIZE+1];
- int num_read;
- int BINARY=0;
- int UUENCODED=0;
- int fd;
-
- if (!dosuffix) goto nosuffix;
- if (!strcmp(COMP_SUFFIX, &name[strlen(name)-strlen(COMP_SUFFIX)]))
- return 0;
- if (test_special_suffix(name)) {
- #if BG_DEBUG
- fprintf(LOGFILE, "special suffix: %s -- not indexing\n", name);
- #endif /*BG_DEBUG*/
- return 1;
- }
-
- nosuffix:
- if((fd = open(name, 0)) < 0) {
- /* This is the only thing the user might want to know: suppress other warnings */
- fprintf(stderr, "permission denied or non-existent file: %s\n", name);
- return(1);
- }
- if ((num_read = read(fd, buffer, SAMPLE_SIZE)) <= 0) {
- #if BG_DEBUG
- fprintf(LOGFILE, "no data: %s -- not indexing\n", name);
- #endif /*BG_DEBUG*/
- close(fd);
- return 1;
- }
-
- if (test_postscript(buffer, num_read)) {
- #if BG_DEBUG
- fprintf(LOGFILE, "postscript file: %s -- not indexing\n", name);
- #endif /*BG_DEBUG*/
- close(fd);
- return 1;
- }
-
- BINARY = test_binary(buffer, num_read);
- if(BINARY == ON) {
- #if BG_DEBUG
- fprintf(LOGFILE, "binary file: %s -- not indexing\n", name);
- #endif /*BG_DEBUG*/
- close(fd);
- return(1);
- }
-
- /* now check for uuencoded file */
- UUENCODED = test_uuencode(buffer, num_read);
- if(UUENCODED == ON) {
- #if BG_DEBUG
- fprintf(LOGFILE, "uuencoded file: %s -- not indexing\n", name);
- #endif /*BG_DEBUG*/
- close(fd);
- return(1);
- }
- if(heavy_index(name, buffer, num_read)) {
- #if BG_DEBUG
- fprintf(LOGFILE, "heavy index file: %s -- not indexing\n ", name);
- #endif /*BG_DEBUG*/
- close(fd);
- return(1);
- }
- if(hqx(name, buffer, num_read)) {
- #if BG_DEBUG
- fprintf(LOGFILE, "too few real words: %s -- not indexing\n", name);
- #endif /*BG_DEBUG*/
- close(fd);
- return(1);
- }
- close(fd);
- return(0);
- }
-
- /* ----------------------------------------------------------------------
- check for heavy index file.
- the function first test block 1 (of SAMPLE_SIZE bytes).
- the file is determined to be heavy index file if
- index_ratio > 0.9 and num_words > 500
- ???
- ---------------------------------------------------------------------- */
- heavy_index(name, buffer, num_read)
- char *name;
- char *buffer;
- int num_read;
- {
- char *buffer_end;
- int hash_value;
- int new_word_num=0;
- int word_num=0;
- char word[256];
-
- buffer_end = &buffer[num_read];
- while((buffer = getword(word, buffer, buffer_end, NULL)) < buffer_end) {
- if(word[0] == '\0') continue;
- word_num++;
- hash_value = hash4k(word, strlen(word));
- if(member_tag[hash_value] != file_id) {
- new_word_num++;
- member_tag[hash_value] = file_id;
- }
- }
- if(new_word_num * 100 >= word_num * 83 && word_num >= 500) return(1);
- #ifdef debug
- printf("%s: new_word_num=%d, word_num=%d\n", name, new_word_num, word_num);
- #endif
- return(0);
- }
-
- /* ----------------------------------------------------------------------
- check for hqx encoded files or other files with long lines,
- for example, postscript files, core files, and others.
- the function first test block 1 (of SAMPLE_SIZE bytes).
- the file is determined to be bad if the ratio of blanks or newlines
- is too small.
- ---------------------------------------------------------------------- */
-
- hqx(name, buffer, num_read)
- char *name;
- char *buffer;
- int num_read;
- {
- int i;
- char c;
- int sep=0;
- if (num_read < 2048) return(0) ;
- for (i=0; i < num_read ; i++) {
- c=buffer[i];
- if (c == '\n' || c == ' ' || c == '/') sep++;
- /* the '/' is for list of file names, including .name_list. */
- /* the \n is for lists of words, but should be excluded really so
- that dictionaries are excluded */
- }
- if (!sep) return(1);
- if (num_read/sep > WORD_THRESHOLD) return(1);
- else return(0);
- }
-